[Proposed] Imbalanced Data: Feeding Predicted Probabilities into the AUC

Author

김보람

Published

February 4, 2024

imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection # train/test split
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_geometric/typing.py:18: UserWarning: An issue occurred while importing 'pyg-lib'. Disabling its usage. Stacktrace: /home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/libpyg.so: undefined symbol: _ZN2at4_ops12split_Tensor4callERKNS_6TensorEN3c106SymIntEl
  warnings.warn(f"An issue occurred while importing 'pyg-lib'. "
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_geometric/typing.py:31: UserWarning: An issue occurred while importing 'torch-scatter'. Disabling its usage. Stacktrace: /home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_scatter/_scatter_cuda.so: undefined symbol: _ZNK3c107SymBool10guard_boolEPKcl
  warnings.warn(f"An issue occurred while importing 'torch-scatter'. "
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_geometric/typing.py:42: UserWarning: An issue occurred while importing 'torch-sparse'. Disabling its usage. Stacktrace: /home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_sparse/_diag_cuda.so: undefined symbol: _ZN3c106detail19maybe_wrap_dim_slowIlEET_S2_S2_b
  warnings.warn(f"An issue occurred while importing 'torch-sparse'. "
def throw(df, fraud_rate):  # downsample normal transactions so the overall fraud rate matches fraud_rate
    df1 = df[df['is_fraud'] == 1].copy()
    df0 = df[df['is_fraud'] == 0].copy()
    df0_downsample = (len(df1) * (1-fraud_rate)) / (len(df0) * fraud_rate)  # fraction of normals to keep
    df0_down = df0.sample(frac=df0_downsample, random_state=42)
    df_p = pd.concat([df1, df0_down])
    return df_p
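By construction the kept normals number about len(df1)·(1−r)/r, so the fraud share of df_p comes out approximately r. A minimal sanity check (df_check is a hypothetical name; fraudTrain is loaded further below):

df_check = throw(fraudTrain, 0.3)    # hypothetical usage
print(df_check['is_fraud'].mean())   # expected ≈ 0.3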

def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3):
    n = len(data_frame)

    # separate fraud and normal transactions
    fraud_data = data_frame[data_frame['is_fraud'] == 1]
    normal_data = data_frame[data_frame['is_fraud'] == 0]

    # compute test-set sizes
    test_samples = int(test_fraud_rate * (n * test_rate))
    remaining_test_samples = int(n * test_rate) - test_samples

    # randomly sample test rows from each class
    test_fraud_data = fraud_data.sample(n=test_samples, replace=False)
    test_normal_data = normal_data.sample(n=remaining_test_samples, replace=False)

    # combine the test data
    test_data = pd.concat([test_normal_data, test_fraud_data])

    # the remaining rows form the training data
    train_data = data_frame[~data_frame.index.isin(test_data.index)]

    return train_data, test_data
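A similar hedged check for the split (variable names are illustrative): the held-out frame should be ~30% of the rows with ~5% fraud.

tr_chk, tst_chk = split_dataframe(df_check, 0.05)                # hypothetical usage
print(tst_chk['is_fraud'].mean(), len(tst_chk) / len(df_check))  # expected ≈ 0.05, ≈ 0.3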

def concat(df_tr, df_tst):   
    df = pd.concat([df_tr, df_tst])
    train_mask = np.concatenate((np.full(len(df_tr), True), np.full(len(df_tst), False)))    # masks record which rows came from train vs. test after concatenation
    test_mask =  np.concatenate((np.full(len(df_tr), False), np.full(len(df_tst), True))) 
    mask = (train_mask, test_mask)
    return df, mask
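A quick check of the mask semantics (hypothetical names, run after the split): each mask selects exactly the rows contributed by the corresponding frame.

df2_chk, mask_chk = concat(tr_chk, tst_chk)   # hypothetical usage
assert mask_chk[0].sum() == len(tr_chk) and mask_chk[1].sum() == len(tst_chk)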

def evaluation(y, yhat):
    metrics = [sklearn.metrics.accuracy_score,
               sklearn.metrics.precision_score,
               sklearn.metrics.recall_score,
               sklearn.metrics.f1_score,
               sklearn.metrics.roc_auc_score]
    return pd.DataFrame({m.__name__:[m(y,yhat).round(6)] for m in metrics})

def compute_time_difference(group):  # pairwise |Δt| in seconds within a group; O(n²) in the group size
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs((group.iloc[i].trans_date_trans_time - group.iloc[j].trans_date_trans_time).total_seconds())
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result

def edge_index(df, unique_col, theta, gamma):
    groups = df.groupby(unique_col)
    edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups) for item in sublist])
    edge_index = edge_index.astype(np.float64)
    # filename = f"edge_index{str(unique_col).replace(' ', '').replace('_', '')}.npy"  # save for reuse
    # np.save(filename, edge_index)
    # time-decay weight exp(-Δt/θ); zero out weights exactly equal to 1 (self-pairs with Δt = 0)
    edge_index[:,2] = (np.exp(-edge_index[:,2]/theta) != 1) * np.exp(-edge_index[:,2]/theta)
    # keep only pairs whose weight exceeds γ
    edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > gamma], dtype=torch.long).t()
    return edge_index
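The weighting is a time-decay kernel: for two transactions i, j sharing the same unique_col value, w_ij = exp(−|t_i − t_j|/θ); self-pairs (w = 1) are dropped, and an edge survives only when w_ij > γ. A small illustration with made-up time gaps:

theta, gamma = 8.028e4, 0.3                       # the values used below
dt = np.array([0.0, 3600.0, 86400.0, 604800.0])   # illustrative gaps in seconds
w = np.exp(-dt / theta)
print(np.round(w, 4), w > gamma)   # only pairs close enough in time keep an edge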



def gcn_data(df):
    # note: reads the globals `edge_index` and `mask` that exist at call time
    x = torch.tensor(df['amt'].values, dtype=torch.float).reshape(-1,1)
    y = torch.tensor(df['is_fraud'].values,dtype=torch.int64)
    data = torch_geometric.data.Data(x=x, edge_index = edge_index, y=y, train_mask = mask[0], test_mask= mask[1])
    return data
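A hedged, self-contained variant that avoids the globals (gcn_data_v2 is a hypothetical name, not part of the original code):

def gcn_data_v2(df, edge_index, mask):
    # same tensors as gcn_data, but the graph and masks are passed in explicitly
    x = torch.tensor(df['amt'].values, dtype=torch.float).reshape(-1, 1)
    y = torch.tensor(df['is_fraud'].values, dtype=torch.int64)
    return torch_geometric.data.Data(x=x, edge_index=edge_index, y=y,
                                     train_mask=mask[0], test_mask=mask[1])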
class GCN1(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 32)
        self.conv2 = GCNConv(32, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
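Since the network ends in log_softmax and training (below) uses nll_loss, the pair is equivalent to ordinary cross-entropy on the logits; this is also why exp recovers class probabilities later. A quick self-contained check:

logits = torch.randn(4, 2)
target = torch.tensor([0, 1, 1, 0])
assert torch.allclose(F.nll_loss(F.log_softmax(logits, dim=1), target),
                      F.cross_entropy(logits, target))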

def train_and_evaluate_model(data, model, optimizer, num_epochs=400):
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        out = model(data)
        loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
    
    model.eval()
    pred = model(data).argmax(dim=1)
    yyhat = pred[data.test_mask]
    
    return yyhat

# # create the model and optimizer
# model = GCN1()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# # run training and collect test predictions
# yyhat = train_and_evaluate_model(data, model, optimizer)
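For the AUC experiments below it is convenient to return P(fraud) instead of argmax labels; a sketch of such a variant (train_and_evaluate_proba is hypothetical, mirroring the training loop above):

def train_and_evaluate_proba(data, model, optimizer, num_epochs=400):
    model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        loss = F.nll_loss(model(data)[data.train_mask], data.y[data.train_mask])
        loss.backward()
        optimizer.step()
    model.eval()
    with torch.no_grad():
        prob = torch.exp(model(data))[:, 1]   # undo log_softmax → class-1 probability
    return prob[data.test_mask].numpy()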
import pickle 
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)    

(throw 0.3 / split 0.05): downsample the full data to a 30% fraud rate, then hold out a test set (30% of rows) whose fraud rate is 5%.

df = throw(fraudTrain, 0.3)
df_tr, df_tst = split_dataframe(df, 0.05)
df2, mask = concat(df_tr, df_tst)
df2['index'] = df2.index
df3 = df2.reset_index()

Attempt 1

edge_index = edge_index(df3, 'cc_num', 8.028000e+04, 0.3)  # θ = 8.028e4, γ = 0.3; note this rebinds the name edge_index from function to tensor
data = gcn_data(df3)
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
model.train()
for epoch in range(400):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

model.eval()
pred = model(data).argmax(dim=1)
yyhat = pred[data.test_mask]
model(data)
tensor([[-3.4273, -0.0330],
        [-0.3912, -1.1277],
        [-0.3912, -1.1277],
        ...,
        [-5.0603, -0.0064],
        [-1.6619, -0.2104],
        [-3.6349, -0.0267]], grad_fn=<LogSoftmaxBackward0>)
pred
tensor([1, 0, 0,  ..., 1, 1, 1])
yyhat
tensor([0, 0, 0,  ..., 1, 1, 1])
  • model(data) returns log-softmax values, so try applying exp to get probabilities…
torch.exp(model(data))[:,-1]
tensor([0.9675, 0.3238, 0.3238,  ..., 0.9937, 0.8102, 0.9736],
       grad_fn=<SelectBackward0>)
yhat_ = model(data)[:,-1][data.test_mask]
yhat_01 = yhat_ > 0.5
acc = sklearn.metrics.accuracy_score(yy,yhat_01)
pre = sklearn.metrics.precision_score(yy,yhat_01)
rec = sklearn.metrics.recall_score(yy,yhat_01)
f1 = sklearn.metrics.f1_score(yy,yhat_01)
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
yhat__ = yhat_.detach().numpy()
auc = sklearn.metrics.roc_auc_score(yy,yhat__)
acc, pre, rec, f1, auc
(0.9500499500499501, 0.0, 0.0, 0.0, 0.9889841102932585)

→ Fed in raw, everything except acc and AUC comes out 0.
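The cause: log-softmax outputs are ≤ 0, so yhat_ > 0.5 is never True and precision/recall/F1 collapse to 0 (AUC survives because it only uses the ranking). Thresholding in log space would fix the labels without exp:

yhat_01 = yhat_ > np.log(0.5)   # equivalent to exp(yhat_) > 0.5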

yyhat2 = torch.exp(model(data))[:,-1][data.test_mask]
yhat_02 = yyhat2 > 0.5
acc = sklearn.metrics.accuracy_score(yy,yhat_02)
pre = sklearn.metrics.precision_score(yy,yhat_02)
rec = sklearn.metrics.recall_score(yy,yhat_02)
f1 = sklearn.metrics.f1_score(yy,yhat_02)
#auc = sklearn.metrics.roc_auc_score(y,yyhat2)
acc, pre, rec, f1
(0.9716949716949717, 0.6477272727272727, 0.95, 0.7702702702702702)
yyhat22 = yyhat2.detach().numpy()
auc = sklearn.metrics.roc_auc_score(yy,yyhat22)
auc
0.9889841102932585
sklearn.metrics.roc_auc_score(yy,yyhat22)
0.9889841102932585
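The AUC is identical with or without exp because ROC AUC depends only on the ordering of the scores, and exp is strictly increasing. A toy check:

y_toy = np.array([0, 0, 1, 1])
s_toy = np.array([-2.0, -1.5, -0.7, -0.1])   # log-probability-like scores
assert sklearn.metrics.roc_auc_score(y_toy, s_toy) == sklearn.metrics.roc_auc_score(y_toy, np.exp(s_toy))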
def evaluate(y, yhat):
    y = pd.Series(np.array(y).reshape(-1))
    yhat = pd.Series(np.array(yhat).reshape(-1))
    nan_rate = pd.Series(yhat).isna().mean()
    nan_index = pd.Series(yhat).isna()
    y = np.array(y)[~nan_index]
    yhat_prob = np.array(yhat)[~nan_index]
    yhat_01 = yhat_prob > 0.5
    acc = sklearn.metrics.accuracy_score(y,yhat_01)
    pre = sklearn.metrics.precision_score(y,yhat_01)
    rec = sklearn.metrics.recall_score(y,yhat_01)
    f1 = sklearn.metrics.f1_score(y,yhat_01)
    auc = sklearn.metrics.roc_auc_score(y,yhat_prob)
    return {'acc':acc,'pre':pre,'rec':rec,'f1':f1,'auc':auc}
y = pd.Series(np.array(yy).reshape(-1))
yhat = pd.Series(np.array(yyhat).reshape(-1))
nan_rate = pd.Series(yhat_).isna().mean()
nan_index = pd.Series(yhat_).isna()
y = np.array(yy)[~nan_index]
yhat_prob = np.array(yhat_)[~nan_index]
yhat_01 = yhat_prob > 0.5
RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.
  • Converting to numpy fails here because the tensor still requires grad; it needs .detach() first.

- The result seems to match the evaluate values from the earlier run anyway.

yyhat = train_and_evaluate_model(data, model, optimizer)
yyhat
tensor([0, 0, 0,  ..., 1, 1, 1])
evaluate(yy, yyhat)
{'acc': 0.9696969696969697,
 'pre': 0.6299559471365639,
 'rec': 0.9533333333333334,
 'f1': 0.7586206896551724,
 'auc': 0.9619453207150368}
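Passing the class-1 probabilities (yyhat22) to evaluate instead of the argmax labels should reproduce the probability-based AUC from the manual run above (a hedged usage, same session assumed):

evaluate(yy, yyhat22)   # auc computed from probabilities rather than 0/1 labels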